# install.packages("readxl")
library(readxl)
source("https://bioconductor.org/biocLite.R")
Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
A new version of Bioconductor is available after installing the most recent version of R; see http://bioconductor.org/install
biocLite("Biobase")
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.3 (2017-11-30).
Installing package(s) 'Biobase'
trying URL 'https://bioconductor.org/packages/3.6/bioc/bin/macosx/el-capitan/contrib/3.4/Biobase_2.38.0.tgz'
Content type 'application/x-gzip' length 2157954 bytes (2.1 MB)
==================================================
downloaded 2.1 MB
# Version info: R 3.2.3, Biobase 2.30.0, GEOquery 2.40.0, limma 3.26.8
# R scripts generated Fri Aug 31 00:53:50 EDT 2018
# Server: www.ncbi.nlm.nih.gov
# Query: acc=GSE45120&platform=GPL6244&type=txt&groups=ZR-75-1|ZR-75-30&colors=dfeaf4|f4dfdf&selection=XXXXXX&padj=fdr&logtransform=auto&columns=ID&columns=adj.P.Val&columns=P.Value&columns=t&columns=B&columns=logFC&columns=Gene+symbol&columns=Gene+title&num=250&annot=ncbi
# Unable to generate script analyzing differential expression.
# Invalid input: at least two groups of samples should be selected.
################################################################
# Boxplot for selected GEO samples
library(Biobase)
library(GEOquery)
# load series and platform data from GEO
canadas_data <- paste(getwd(),"../..","datasets","canadas_2018/",sep = "/")
gset_canadas0 <- getGEO("GSE45120",destdir = canadas_data)
if (length(gset_canadas0) > 1) idx <- grep("GPL6244", attr(gset, "names")) else idx <- 1
gset_canadas <- gset_canadas0[[idx]]
# # set parameters and draw the plot
# palette(c("#dfeaf4","#f4dfdf", "#AABBCC"))
# dev.new(width=4+dim(gset)[[2]]/5, height=6)
# par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1))
# title <- paste ("GSE45120", '/', annotation(gset), " selected samples", sep ='')
# boxplot(exprs(gset), boxwex=0.7, notch=T, main=title, outline=FALSE, las=2)
# log2 transform
exfull <- exprs(gset_canadas)
# remove NAs
gset_canadas_trsf <- gset_canadas
qxfull <- as.numeric(quantile(exfull, c(0., 0.25, 0.5, 0.75, 0.99,1.0), na.rm=T))
LogCfill <- (qxfull[5] > 100) ||
(qxfull[6]-qxfull[1] > 50 && qxfull[2] > 0) ||
(qxfull[2] > 0 && qxfull[2] < 1 && qxfull[4] > 1 && qxfull[4] < 2)
# invoke this if log2 normalization isn't already in effect
if (LogCfill) {
print("log normalizing")
exfull[which(exfull <= 0)] <- NaN
exprs(gset_canadas_trsf) <- log2(exfull)
} else if (qxfull[6] > 100) {
# if the qxfull[6] full is extremely high...
print("adjusting values")
# remove extremely high values; since these are log2 fold changes, we don't want anything insanely high
exfull[which(exfull >= 10)] <- NA
# also remove extremely low values
exfull[which(exfull < -10)] <- NA
exprs(gset_canadas_trsf) <- exfull
}
# remove all rows that are all NA's
gset_canadas_trsf_with_NA <- apply(X = gset_canadas_trsf,
MARGIN = 1,
function(x) {
#length(x[is.na(x)]) > 1
length(x[is.na(x)]) == length(x)
})
gset_canadas_trsf_with_NA <- gset_canadas_trsf_with_NA[gset_canadas_trsf_with_NA]
The data came from a one-channel microarray
print(gset_canadas$description)
This is a pretty simple experimental setup: two conditions, one H69M and one H69 (both cell lines). We should consider how lung-cancer inflammation responses vary from EOC, Breast, or Colon cancer responses (as in Li et al 2014 and Chiappinelli et al 2015)
We will now fit the model. This doesn’t appear to be log2-normalized, so we will do so here
exprs(gset_canadas_trsf) <- log2(exfull)
lmfit_canadas <- lmFit(gset_canadas_trsf, design_canadas)
Extract necessary statistics and the differentially-expressed genes
alpha <- 0.05
log2fc_lim <- 0.5
canadas_cont_matrix <- makeContrasts(H69M-H69, levels=design_canadas)
lmfit_canadas_fit1 <- contrasts.fit(lmfit_canadas, canadas_cont_matrix)
lmfit_canadas_ebayes <- eBayes(fit = lmfit_canadas_fit1,proportion = alpha)
tT_canadas <- topTable(lmfit_canadas_ebayes,
adjust="fdr",
sort.by="B", number=nrow(gset_canadas_trsf))
# now for the frustrating business of extracting the names of different genes
tT_canadas_gene_names <- sapply(tT_canadas$gene_assignment,
function(x) {
a <- unlist(strsplit(x = as.character(x),split = " // ",fixed = TRUE))
a[2]
})
tT_canadas$GENE_SYMBOL <- as.character(tT_canadas_gene_names)
tT_canadas_subset <- subset(tT_canadas, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GENE_SYMBOL","AveExpr"))
# # fit1 <- contrasts.fit(fit,cont.matrix)
# fit2 <- contrasts.fit(fit_bycell, cont.matrix)
# # fit1 <- eBayes(fit1, 1)
# fit2 <- eBayes(fit2, 0.01)
# tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
# # allT <- topTable(fit2, adjust="none", sort.by="B",number = nrow(fit2))
# tT <- topTable(fit2, adjust="fdr", sort.by="B",number = nrow(fit2))
Volcano plot of the results
plot(tT_canadas_subset$logFC,
-log10(tT_canadas_subset$adj.P.Val),
xlab=c("log2FC (H69M-H69)"),
ylab="adj.P.val",
pch=20)
tT_canadas_sig_up_h69m <- subset(tT_canadas_subset,adj.P.Val <= alpha & logFC > log2fc_lim)
tT_canadas_sig_down_h69m <- subset(tT_canadas_subset,adj.P.Val <= alpha & logFC < -log2fc_lim)
points(tT_canadas_sig_up_h69m$logFC,
-log10(tT_canadas_sig_up_h69m$adj.P.Val),
pch=20,
col="red")
#col=as.factor(tT_canadas_subset$adj.P.Val <= alpha & tT_canadas_subset$logFC >= log2fc_lim))
abline(h = -log10(alpha),
v = log2fc_lim,
lty=2,
col="red")
# IFNB has a fold-change in expression of 0.2
Canadas et al focused on the 452 most differentially-expressed genes in their analysis (which they obtained from an earlier paper, Canadas et al Clin Cancer Res 2014). The genes are provided in Supplementary Dataset 1, though the exact method to reproduce them is not given.
read_excel_allsheets <- function(filename, tibble = FALSE) {
# from https://stackoverflow.com/questions/12945687/read-all-worksheets-in-an-excel-workbook-into-an-r-list-with-data-frames
sheets <- excel_sheets(filename)
x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X))
if(!tibble) x <- lapply(x, as.data.frame)
names(x) <- sheets
x
}
canadas_topgenes_sem <- read_excel_allsheets(paste(canadas_data,"41591_2018_116_MOESM3_ESM.xlsx",sep = "/"))
canadas_topgenes_sem_up <- canadas_topgenes_sem$`S2_up_down genes H69M vs H69`$`H69M-UP`
canadas_topgenes_sem_down <- canadas_topgenes_sem$`S2_up_down genes H69M vs H69`$`H69M-DOWN`
Replot the volcano but with the specific genes marked
plot(tT_canadas_subset$logFC,
-log10(tT_canadas_subset$adj.P.Val),
xlab=c("log2FC (H69M-H69)"),
ylab="adj.P.val",
pch=20)
# tT_canadas_sig_up_h69m <- subset(tT_canadas_subset,adj.P.Val <= alpha & logFC > log2fc_lim)
# tT_canadas_sig_down_h69m <- subset(tT_canadas_subset,adj.P.Val <= alpha & logFC < -log2fc_lim)
points(tT_canadas_sig_up_h69m$logFC,
-log10(tT_canadas_sig_up_h69m$adj.P.Val),
pch=20,
col="red")
#col=as.factor(tT_canadas_subset$adj.P.Val <= alpha & tT_canadas_subset$logFC >= log2fc_lim))
tT_canadas_sig_up_h69m_in_sem <- subset(tT_canadas_sig_up_h69m,GENE_SYMBOL %in% canadas_topgenes_sem_up)
tT_canadas_sig_up_in_sem <- subset(tT_canadas_subset,GENE_SYMBOL %in% canadas_topgenes_sem_up)
#subset(tT_canadas_subset,GENE_SYMBOL %in% canadas_topgenes_sem_up | GENE_SYMBOL %in% canadas_topgenes_sem_down)
points(tT_canadas_sig_up_in_sem$logFC,
-log10(tT_canadas_sig_up_in_sem$adj.P.Val),
pch=20,
col="orange")
abline(h = -log10(alpha),
v = log2fc_lim,
lty=2,
col="red")
Heatmap:
# gene annotation
gene_annot <- data.frame(sig_up_h69m = as.factor(tT_canadas_subset$GENE_SYMBOL %in% tT_canadas_sig_up_h69m$GENE_SYMBOL),
sig_up_h69m_and_sem = as.factor(tT_canadas_subset$GENE_SYMBOL %in% tT_canadas_sig_up_h69m_in_sem$GENE_SYMBOL),
sig_up_sem = as.factor(tT_canadas_subset$GENE_SYMBOL %in% tT_canadas_sig_up_in_sem$GENE_SYMBOL))
rownames(gene_annot) <- tT_canadas_subset$ID
# sample annotation
sample_annot <- gset_canadas_meta
rownames(sample_annot) <- colnames(exprs(gset_canadas_trsf))
# pare down the expression matrix to a desired subset
exp_full <- exprs(gset_canadas_trsf)
gset_canadas_trsf_hmap_sig <- exp_full[rownames(exp_full) %in% rownames(tT_canadas_sig_up_h69m),]
gene_annot_sig <- gene_annot[rownames(gene_annot) %in% rownames(gset_canadas_trsf_hmap_sig),]
# heatmap
pheatmap(gset_canadas_trsf_hmap_sig,
annotation_col = sample_annot,
annotation_row = gene_annot_sig,
show_rownames = FALSE,
show_colnames = FALSE)
# how many genes are in theirs but not mine?
# genes that are significant to them but not to me
gene_annot_up_sem_uniq <- subset(gene_annot,sig_up_sem == TRUE & sig_up_h69m == FALSE)
# match the canadas-unique probes to genes
gene_annot_up_sem_uniq_genes <- subset(tT_canadas_subset,ID %in% rownames(gene_annot_up_sem_uniq))
gene_annot_up_sem_uniq.n_genes <- unique(gene_annot_up_sem_uniq_genes$GENE_SYMBOL)
print(paste("They report",nrow(gene_annot_up_sem_uniq),"probes representing",length(gene_annot_up_sem_uniq.n_genes),"genes as significantly-upregulated in H69M that I do not"))
[1] "They report 116 probes representing 112 genes as significantly-upregulated in H69M that I do not"
gene_annot_up_mine_uniq <- tT_canadas_sig_up_h69m[!tT_canadas_sig_up_h69m$ID %in% tT_canadas_sig_up_in_sem$ID,]
print(paste("I report",nrow(gene_annot_up_mine_uniq),"probes representing",length(unique(gene_annot_up_mine_uniq$GENE_SYMBOL))," genes as significantly-upregulated in H69M that they do not"))
[1] "I report 45 probes representing 38 genes as significantly-upregulated in H69M that they do not"
Fair concordance between my analysis and theirs.
Save my signatures
write.table(x = tT_canadas_sig_up_in_sem,file = "./tT_canadas_sig_up_in_sem.txt",row.names = TRUE,col.names = TRUE,sep = "\t",quote = FALSE)
write.table(x = tT_canadas_sig_up_h69m,file = "./tT_canadas_sig_up_h69m.txt",row.names = TRUE,col.names = TRUE,sep = "\t",quote = FALSE)
write.table(x = tT_canadas_sig_up_h69m_in_sem,file = "./tT_canadas_sig_up_h69m_in_sem.txt",row.names = TRUE,col.names = TRUE,sep = "\t",quote = FALSE)
Prepare gsva
prepare_gsva()
Gene set imports
# import some gene sets for GSVA
kegg_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/msigdb_c2.cp.kegg_hsapiens.gmt")
bp_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/msigdb_c5.bp_hsapiens.gmt")
cancer_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/msigdb_c6_hsapiens.gmt")
immune_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/msigdb_c7_hsapiens.gmt")
# from the Li et al oncotarget 2014 paper
interferon_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/custom/msigdb_interferon.gmt")
cytochemokine_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/custom/msigdb_cytochemokine.gmt")
inflammation_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/custom/msigdb_inflammation.gmt")
antigen_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/custom/msigdb_antigen.gmt")
# virus-related terms
virus_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/custom/msigdb_virus.gmt")
# hallmark sets--verified in microarrays
hallmark_genesets <- import_gmt(gmtfile = "../../referenceAnnot/genesets/hg38/msigdb_hallmark_hsapiens.gmt")
Get counts
tT_canadas_sig_up_h69m_expr <- exprs(gset_canadas_trsf)[rownames(exprs(gset_canadas_trsf)) %in% tT_canadas_sig_up_h69m$ID,]
tT_canadas_sig_up_h69m_expr_genes <- sapply(rownames(tT_canadas_sig_up_h69m_expr),
function(x) {
a <- subset(tT_canadas_sig_up_h69m,ID %in% x)
a[,"GENE_SYMBOL"]
})
# remove NA probes
tT_canadas_sig_up_h69m_expr <- tT_canadas_sig_up_h69m_expr[!is.na(tT_canadas_sig_up_h69m_expr_genes),]
tT_canadas_sig_up_h69m_expr_genes <- tT_canadas_sig_up_h69m_expr_genes[!is.na(tT_canadas_sig_up_h69m_expr_genes)]
# ids
tT_canadas_sig_up_h69m_expr_ids <- rownames(tT_canadas_sig_up_h69m_expr)
rownames(tT_canadas_sig_up_h69m_expr) <- tT_canadas_sig_up_h69m_expr_genes
# perform GSVA with KEGG
canadas_kegg_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = kegg_genesets)
# perform GSVA with Cancer pathways
canadas_cancer_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = cancer_genesets)
# perform GSVA with Immune pathways
canadas_immune_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = immune_genesets)
# perform GSVA with Immune pathways
canadas_bp_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = bp_genesets)
# perform GSVA with GO terms from Li et al Oncotarget
canadas_interferon_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = interferon_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
canadas_cytochemokine_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = cytochemokine_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
canadas_inflammation_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = inflammation_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
canadas_antigen_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = antigen_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
# perform GSVA with viral terms
canadas_virus_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = virus_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
# perform GSVA with hallmark terms
canadas_hallmark_enrich <- gsva_enrich(counts = tT_canadas_sig_up_h69m_expr,
geneset = hallmark_genesets,
ikcdf = "Gaussian",method=c("ssgsea"))
# heatmaps for pathways
pheatmap(canadas_kegg_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_kegg_enrich),
show_colnames = FALSE,
fontsize = 2.5,
main = "KEGG")
pheatmap(canadas_cancer_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_cancer_enrich),
show_colnames = FALSE,
fontsize = 2.5,
main = "Cancer pathways (general)")
pheatmap(canadas_immune_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_immune_enrich),
show_colnames = FALSE,
fontsize = 2.5,
main = "Immune pathways")
pheatmap(canadas_bp_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_bp_enrich),
show_colnames = FALSE,
fontsize = 2.5,
main = "GO Biological Processes")
# heatmaps for li et al oncotarget signatures
pheatmap(canadas_interferon_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_interferon_enrich),
show_colnames = FALSE,
cluster_cols = FALSE,
fontsize = 2.5,
main = "Interferon GO Terms")
pheatmap(canadas_cytochemokine_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_cytochemokine_enrich),
show_colnames = FALSE,
fontsize = 5.0,
cluster_cols = FALSE,
main = "Cytokine/Chemokine GO Terms")
pheatmap(canadas_inflammation_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_inflammation_enrich),
cluster_rows = FALSE,
cluster_cols = FALSE,
show_colnames = FALSE,
fontsize = 5.0,
main = "Inflammation GO terms")
pheatmap(canadas_antigen_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_antigen_enrich),
show_rownames = TRUE,
fontsize = 4.0,
show_colnames = FALSE,
cluster_cols = FALSE,
main = "Antigen Presentation and Receptor GO Terms")
pheatmap(canadas_virus_enrich,
annotation_col = gset_canadas_meta,
labels_row = rownames(canadas_virus_enrich),
show_rownames = TRUE,
show_colnames = FALSE,
cluster_cols = FALSE,
fontsize_row = 2.5,
main = "Virus GO Terms")